- /*
- * linux/mm/slab.c
- * Written by Mark Hemment, 1996/97.
- * (markhe@nextd.demon.co.uk)
- *
- * 11 April '97. Started multi-threading - markhe
- * The global cache-chain is protected by the semaphore 'cache_chain_sem'.
- * The sem is only needed when accessing/extending the cache-chain, which
- * can never happen inside an interrupt (kmem_cache_create(),
- * kmem_cache_shrink() and kmem_cache_reap()).
- * This is a medium-term exclusion lock.
- *
- * Each cache has its own lock; 'c_spinlock'. This lock is needed only
- * when accessing non-constant members of a cache-struct.
- * Note: 'constant members' are assigned a value in kmem_cache_create() before
- * the cache is linked into the cache-chain. The values never change, so not
- * even a multi-reader lock is needed for these members.
- * The c_spinlock is only ever held for a few cycles.
- *
- * To prevent kmem_cache_shrink() trying to shrink a 'growing' cache (which
- * may be sleeping and therefore not holding the semaphore/lock), the
- * c_growing field is used. This also prevents reaping from a cache.
- *
- * Note, caches can _never_ be destroyed. When a sub-system (eg module) has
- * finished with a cache, it can only be shrunk. This leaves the cache empty,
- * but already enabled for re-use, eg. during a module re-load.
- *
- * Notes:
- * o Constructors/destructors are called while the cache-lock
- * is _not_ held. Therefore they _must_ be threaded (reentrant).
- * o Constructors must not attempt to allocate memory from the
- * same cache that they are a constructor for - infinite loop!
- * (There is no easy way to trap this.)
- * o The per-cache locks must be obtained with local-interrupts disabled.
- * o When compiled with debug support, and an object-verify (upon release)
- * is requested for a cache, the verify-function is called with the cache
- * lock held. This helps debugging.
- * o The functions called from try_to_free_page() must not attempt
- * to allocate memory from a cache which is being grown.
- * The buffer sub-system might try to allocate memory, via buffer_cachep.
- * As this priority is passed to the SLAB, and then (if necessary) onto the
- * gfp() funcs (which avoid calling try_to_free_page()), no deadlock
- * should happen.
- *
- * The positioning of the per-cache lock is tricky. If the lock is
- * placed on the same h/w cache line as commonly accessed members
- * the number of L1 cache-line faults is reduced. However, this can
- * lead to the cache-line ping-ponging between processors when the
- * lock is in contention (and the common members are being accessed).
- * Decided to keep it away from common members.
- *
- * More fine-graining is possible, with per-slab locks... but this might be
- * taking fine-graining too far. It would have the advantage that
- * during most allocs/frees no writes occur to the cache-struct, so
- * a multi-reader/one-writer lock could be used (the writer only being
- * needed when the slab chain is linked/unlinked).
- * As we would not have an exclusion lock for the cache-structure, one
- * would be needed per-slab (for updating s_free ptr, and/or the contents
- * of s_index).
- * The above locking would allow parallel operations to different slabs within
- * the same cache with reduced spinning.
- *
- * Per-engine slab caches, backed by a global cache (as in Mach's Zone allocator),
- * would allow most allocations from the same cache to execute in parallel.
- *
- * At present, each engine can be growing a cache. This should be blocked.
- *
- * It is not currently 100% safe to examine the page_struct outside of a kernel
- * or global cli lock. The risk is v. small, and non-fatal.
- *
- * Calls to printk() are not 100% safe (the function is not threaded). However,
- * printk() is only used under an error condition, and the risk is v. small (not
- * sure if the console write functions 'enjoy' executing in multiple contexts
- * in parallel. I guess they don't...).
- * Note, for most calls to printk() any held cache-lock is dropped. This is not
- * always done, for text-size reasons - having *_unlock() everywhere is bloat.
- */
-
- /*
- * An implementation of the Slab Allocator as described in outline in:
- * UNIX Internals: The New Frontiers by Uresh Vahalia
- * Pub: Prentice Hall ISBN 0-13-101908-2
- * or with a little more detail in:
- * The Slab Allocator: An Object-Caching Kernel Memory Allocator
- * Jeff Bonwick (Sun Microsystems).
- * Presented at: USENIX Summer 1994 Technical Conference
- */
-
- /*
- * This implementation deviates from Bonwick's paper as it
- * does not use a hash-table for large objects, but rather a per slab
- * index to hold the bufctls. This allows the bufctl structure to
- * be small (one word), but limits the number of objects a slab (not
- * a cache) can contain when off-slab bufctls are used. The limit is the
- * size of the largest general cache that does not use off-slab bufctls,
- * divided by the size of a bufctl. For 32bit archs, this is 256/4 = 64.
- * This is not serious, as it is only for large objects, when it is unwise
- * to have too many per slab.
- * Note: This limit can be raised by introducing a general cache whose size
- * is less than 512 (PAGE_SIZE>>3), but greater than 256.
- */
-
- #include <linux/config.h>
- #include <linux/slab.h>
- #include <linux/interrupt.h>
- #include <linux/init.h>
-
- /* If there is a different PAGE_SIZE around, and it works with this allocator,
- * then change the following.
- */
- #if (PAGE_SIZE != 8192 && PAGE_SIZE != 4096)
- #error Your page size is probably not correctly supported - please check
- #endif
-
- /* SLAB_MGMT_CHECKS - 1 to enable extra checks in kmem_cache_create().
- * 0 if you wish to reduce memory usage.
- *
- * SLAB_DEBUG_SUPPORT - 1 for kmem_cache_create() to honour: SLAB_DEBUG_FREE,
- * SLAB_DEBUG_INITIAL, SLAB_RED_ZONE & SLAB_POISON.
- * 0 for faster, smaller, code (especially in the critical paths).
- *
- * SLAB_STATS - 1 to collect stats for /proc/slabinfo.
- * 0 for faster, smaller, code (especially in the critical paths).
- *
- * SLAB_SELFTEST - 1 to perform a few tests, mainly for development.
- */
- #define SLAB_MGMT_CHECKS 1
- #define SLAB_DEBUG_SUPPORT 0
- #define SLAB_STATS 0
- #define SLAB_SELFTEST 0
-
- /* Shouldn't this be in a header file somewhere? */
- #define BYTES_PER_WORD sizeof(void *)
-
- /* Legal flag mask for kmem_cache_create(). */
- #if SLAB_DEBUG_SUPPORT
- #if 0
- #define SLAB_C_MASK (SLAB_DEBUG_FREE|SLAB_DEBUG_INITIAL|SLAB_RED_ZONE| \
- SLAB_POISON|SLAB_HWCACHE_ALIGN|SLAB_NO_REAP| \
- SLAB_HIGH_PACK)
- #endif
- #define SLAB_C_MASK (SLAB_DEBUG_FREE|SLAB_DEBUG_INITIAL|SLAB_RED_ZONE| \
- SLAB_POISON|SLAB_HWCACHE_ALIGN|SLAB_NO_REAP)
- #else
- #if 0
- #define SLAB_C_MASK (SLAB_HWCACHE_ALIGN|SLAB_NO_REAP|SLAB_HIGH_PACK)
- #endif
- #define SLAB_C_MASK (SLAB_HWCACHE_ALIGN|SLAB_NO_REAP)
- #endif /* SLAB_DEBUG_SUPPORT */
-
- /* Slab management struct.
- * Manages the objs in a slab. Placed either at the end of the mem allocated
- * for a slab, or taken from an internal obj cache (cache_slabp).
- * Slabs are chained into a partially ordered list; fully used first, partial
- * next, and then fully free slabs.
- * The first 4 members are referenced during an alloc/free operation, and
- * should always appear on the same cache line.
- * Note: The offset between some members _must_ match offsets within
- * the kmem_cache_t - see kmem_cache_init() for the checks. */
-
- #define SLAB_OFFSET_BITS 16 /* could make this larger for 64bit archs */
-
- typedef struct kmem_slab_s {
- struct kmem_bufctl_s *s_freep; /* ptr to first inactive obj in slab */
- struct kmem_bufctl_s *s_index;
- unsigned long s_magic;
- unsigned long s_inuse; /* num of objs active in slab */
-
- struct kmem_slab_s *s_nextp;
- struct kmem_slab_s *s_prevp;
- void *s_mem; /* addr of first obj in slab */
- unsigned long s_offset:SLAB_OFFSET_BITS,
- s_dma:1;
- } kmem_slab_t;
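- 
- /* Rough on-slab layout, left to right (an illustration only - colouring,
- * alignment, and debug flags all shift things around):
- *
- *   [colour offset][obj 0][bufctl 0][obj 1][bufctl 1]...[waste][kmem_slab_t]
- *
- * s_mem points at obj 0; the kmem_slab_t at the end is L1-cache aligned.
- * With SLAB_CFLGS_BUFCTL the bufctls live in a separate array (s_index);
- * with SLAB_CFLGS_OFF_SLAB the kmem_slab_t itself comes from cache_slabp.
- */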
-
- /* When the slab management is on-slab, this gives the size to use. */
- #define slab_align_size (L1_CACHE_ALIGN(sizeof(kmem_slab_t)))
-
- /* Test for end of slab chain. */
- #define kmem_slab_end(x) ((kmem_slab_t*)&((x)->c_offset))
-
- /* s_magic */
- #define SLAB_MAGIC_ALLOC 0xA5C32F2BUL /* slab is alive */
- #define SLAB_MAGIC_DESTROYED 0xB2F23C5AUL /* slab has been destroyed */
-
- /* Bufctls are used for linking objs within a slab, identifying what slab an obj
- * is in, and the address of the associated obj (for sanity checking with off-slab
- * bufctls). What a bufctl contains depends upon the state of the obj and
- * the organisation of the cache.
- */
- typedef struct kmem_bufctl_s {
- union {
- struct kmem_bufctl_s *buf_nextp;
- kmem_slab_t *buf_slabp; /* slab for obj */
- void * buf_objp;
- } u;
- } kmem_bufctl_t;
-
- /* ...shorthand... */
- #define buf_nextp u.buf_nextp
- #define buf_slabp u.buf_slabp
- #define buf_objp u.buf_objp
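- 
- /* A bufctl's union is read according to the obj's state (see
- * kmem_cache_init_objs(), __kmem_cache_alloc() and __kmem_cache_free()):
- * o obj free             -> buf_nextp links it on the slab's freelist.
- * o obj active, on-slab  -> buf_slabp points back at the obj's slab.
- * o obj active, off-slab -> buf_objp records the obj's address, used as
- *                           a sanity check when the obj is released.
- */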
-
- #if SLAB_DEBUG_SUPPORT
- /* Magic nums for obj red zoning.
- * Placed in the first word before and the first word after an obj.
- */
- #define SLAB_RED_MAGIC1 0x5A2CF071UL /* when obj is active */
- #define SLAB_RED_MAGIC2 0x170FC2A5UL /* when obj is inactive */
-
- /* ...and for poisoning */
- #define SLAB_POISON_BYTE 0x5a /* byte value for poisoning */
- #define SLAB_POISON_END 0xa5 /* end-byte of poisoning */
-
- #endif /* SLAB_DEBUG_SUPPORT */
-
- /* Cache struct - manages a cache.
- * First four members are commonly referenced during an alloc/free operation.
- */
- struct kmem_cache_s {
- kmem_slab_t *c_freep; /* first slab w. free objs */
- unsigned long c_flags; /* constant flags */
- unsigned long c_offset;
- unsigned long c_num; /* # of objs per slab */
-
- unsigned long c_magic;
- unsigned long c_inuse; /* kept at zero */
- kmem_slab_t *c_firstp; /* first slab in chain */
- kmem_slab_t *c_lastp; /* last slab in chain */
-
- spinlock_t c_spinlock;
- unsigned long c_growing;
- unsigned long c_dflags; /* dynamic flags */
- size_t c_org_size;
- unsigned long c_gfporder; /* order of pgs per slab (2^n) */
- void (*c_ctor)(void *, kmem_cache_t *, unsigned long); /* constructor func */
- void (*c_dtor)(void *, kmem_cache_t *, unsigned long); /* destructor func */
- unsigned long c_align; /* alignment of objs */
- size_t c_colour; /* cache colouring range */
- size_t c_colour_next;/* cache colouring */
- unsigned long c_failures;
- const char *c_name;
- struct kmem_cache_s *c_nextp;
- kmem_cache_t *c_index_cachep;
- #if SLAB_STATS
- unsigned long c_num_active;
- unsigned long c_num_allocations;
- unsigned long c_high_mark;
- unsigned long c_grown;
- unsigned long c_reaped;
- atomic_t c_errors;
- #endif /* SLAB_STATS */
- };
-
- /* internal c_flags */
- #define SLAB_CFLGS_OFF_SLAB 0x010000UL /* slab management in own cache */
- #define SLAB_CFLGS_BUFCTL 0x020000UL /* bufctls in own cache */
- #define SLAB_CFLGS_GENERAL 0x080000UL /* a general cache */
-
- /* c_dflags (dynamic flags). Need to hold the spinlock to access this member */
- #define SLAB_CFLGS_GROWN 0x000002UL /* don't reap a recently grown */
-
- #define SLAB_OFF_SLAB(x) ((x) & SLAB_CFLGS_OFF_SLAB)
- #define SLAB_BUFCTL(x) ((x) & SLAB_CFLGS_BUFCTL)
- #define SLAB_GROWN(x) ((x) & SLAB_CFLGS_GROWN)
-
- #if SLAB_STATS
- #define SLAB_STATS_INC_ACTIVE(x) ((x)->c_num_active++)
- #define SLAB_STATS_DEC_ACTIVE(x) ((x)->c_num_active--)
- #define SLAB_STATS_INC_ALLOCED(x) ((x)->c_num_allocations++)
- #define SLAB_STATS_INC_GROWN(x) ((x)->c_grown++)
- #define SLAB_STATS_INC_REAPED(x) ((x)->c_reaped++)
- #define SLAB_STATS_SET_HIGH(x) do { if ((x)->c_num_active > (x)->c_high_mark) \
- (x)->c_high_mark = (x)->c_num_active; \
- } while (0)
- #define SLAB_STATS_INC_ERR(x) (atomic_inc(&(x)->c_errors))
- #else
- #define SLAB_STATS_INC_ACTIVE(x)
- #define SLAB_STATS_DEC_ACTIVE(x)
- #define SLAB_STATS_INC_ALLOCED(x)
- #define SLAB_STATS_INC_GROWN(x)
- #define SLAB_STATS_INC_REAPED(x)
- #define SLAB_STATS_SET_HIGH(x)
- #define SLAB_STATS_INC_ERR(x)
- #endif /* SLAB_STATS */
-
- #if SLAB_SELFTEST
- #if !SLAB_DEBUG_SUPPORT
- #error Debug support needed for self-test
- #endif
- static void kmem_self_test(void);
- #endif /* SLAB_SELFTEST */
-
- /* c_magic - used to detect 'out of slabs' in __kmem_cache_alloc() */
- #define SLAB_C_MAGIC 0x4F17A36DUL
-
- /* maximum size of an obj (in 2^order pages) */
- #define SLAB_OBJ_MAX_ORDER 5 /* 32 pages */
-
- /* maximum num of pages for a slab (prevents large requests to the VM layer) */
- #define SLAB_MAX_GFP_ORDER 5 /* 32 pages */
-
- /* the 'preferred' minimum num of objs per slab - maybe less for large objs */
- #define SLAB_MIN_OBJS_PER_SLAB 4
-
- /* If the num of objs per slab is <= SLAB_MIN_OBJS_PER_SLAB,
- * then the page order must be less than this before trying the next order.
- */
- #define SLAB_BREAK_GFP_ORDER_HI 2
- #define SLAB_BREAK_GFP_ORDER_LO 1
- static int slab_break_gfp_order = SLAB_BREAK_GFP_ORDER_LO;
-
- /* Macros for storing/retrieving the cachep and/or slab from the
- * global 'mem_map'. With off-slab bufctls, these are used to find the
- * slab an obj belongs to. With kmalloc(), and kfree(), these are used
- * to find the cache to which an obj belongs.
- */
- #define SLAB_SET_PAGE_CACHE(pg, x) ((pg)->next = (struct page *)(x))
- #define SLAB_GET_PAGE_CACHE(pg) ((kmem_cache_t *)(pg)->next)
- #define SLAB_SET_PAGE_SLAB(pg, x) ((pg)->prev = (struct page *)(x))
- #define SLAB_GET_PAGE_SLAB(pg) ((kmem_slab_t *)(pg)->prev)
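- 
- /* For example, kfree() (below) recovers the cache from a bare obj address
- * via the obj's page:
- *
- *   page = &mem_map[MAP_NR(objp)];
- *   if (PageSlab(page))
- *           cachep = SLAB_GET_PAGE_CACHE(page);
- *
- * kmem_cache_grow() stores both ptrs for every page of each new slab.
- */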
-
- /* Size description struct for general caches. */
- typedef struct cache_sizes {
- size_t cs_size;
- kmem_cache_t *cs_cachep;
- } cache_sizes_t;
-
- static cache_sizes_t cache_sizes[] = {
- #if PAGE_SIZE == 4096
- { 32, NULL},
- #endif
- { 64, NULL},
- { 128, NULL},
- { 256, NULL},
- { 512, NULL},
- {1024, NULL},
- {2048, NULL},
- {4096, NULL},
- {8192, NULL},
- {16384, NULL},
- {32768, NULL},
- {65536, NULL},
- {131072, NULL},
- {0, NULL}
- };
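- 
- /* kmalloc() walks this table and uses the first cache whose cs_size is
- * large enough; e.g. a request for 100 bytes is satisfied from the
- * 128-byte general cache ("size-128"). Requests larger than the last
- * entry fail with an error printk.
- */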
-
- /* Names for the general caches. Not placed into the sizes struct for
- * a good reason; the string ptr is not needed while searching in kmalloc(),
- * and would 'get-in-the-way' in the h/w cache.
- */
- static char *cache_sizes_name[] = {
- #if PAGE_SIZE == 4096
- "size-32",
- #endif
- "size-64",
- "size-128",
- "size-256",
- "size-512",
- "size-1024",
- "size-2048",
- "size-4096",
- "size-8192",
- "size-16384",
- "size-32768",
- "size-65536",
- "size-131072"
- };
-
- /* internal cache of cache description objs */
- static kmem_cache_t cache_cache = {
- /* freep, flags */ kmem_slab_end(&cache_cache), SLAB_NO_REAP,
- /* offset, num */ sizeof(kmem_cache_t), 0,
- /* c_magic, c_inuse */ SLAB_C_MAGIC, 0,
- /* firstp, lastp */ kmem_slab_end(&cache_cache), kmem_slab_end(&cache_cache),
- /* spinlock */ SPIN_LOCK_UNLOCKED,
- /* growing */ 0,
- /* dflags */ 0,
- /* org_size, gfp */ 0, 0,
- /* ctor, dtor, align */ NULL, NULL, L1_CACHE_BYTES,
- /* colour, colour_next */ 0, 0,
- /* failures */ 0,
- /* name */ "kmem_cache",
- /* nextp */ &cache_cache,
- /* index */ NULL,
- };
-
- /* Guard access to the cache-chain. */
- static struct semaphore cache_chain_sem;
-
- /* Clock hand for reaping - marks where the next search starts. */
- static kmem_cache_t *clock_searchp = &cache_cache;
-
- /* Internal slab management cache, for when slab management is off-slab. */
- static kmem_cache_t *cache_slabp = NULL;
-
- /* Max number of objs-per-slab for caches which use bufctl's.
- * Needed to avoid a possible looping condition in kmem_cache_grow().
- */
- static unsigned long bufctl_limit = 0;
-
- /* Initialisation - setup the `cache' cache. */
- long __init kmem_cache_init(long start, long end)
- {
- size_t size, i;
-
- #define kmem_slab_offset(x) ((unsigned long)&((kmem_slab_t *)0)->x)
- #define kmem_slab_diff(a,b) (kmem_slab_offset(a) - kmem_slab_offset(b))
- #define kmem_cache_offset(x) ((unsigned long)&((kmem_cache_t *)0)->x)
- #define kmem_cache_diff(a,b) (kmem_cache_offset(a) - kmem_cache_offset(b))
-
- /* Sanity checks... */
- if (kmem_cache_diff(c_firstp, c_magic) != kmem_slab_diff(s_nextp, s_magic) ||
- kmem_cache_diff(c_firstp, c_inuse) != kmem_slab_diff(s_nextp, s_inuse) ||
- ((kmem_cache_offset(c_lastp) -
- ((unsigned long) kmem_slab_end((kmem_cache_t*)NULL))) !=
- kmem_slab_offset(s_prevp)) ||
- kmem_cache_diff(c_lastp, c_firstp) != kmem_slab_diff(s_prevp, s_nextp)) {
- /* Offsets to the magic are incorrect, either the structures have
- * been incorrectly changed, or adjustments are needed for your
- * architecture.
- */
- panic("kmem_cache_init(): Offsets are wrong - I've been messed with!");
- /* NOTREACHED */
- }
- #undef kmem_cache_offset
- #undef kmem_cache_diff
- #undef kmem_slab_offset
- #undef kmem_slab_diff
-
- cache_chain_sem = MUTEX;
-
- size = cache_cache.c_offset + sizeof(kmem_bufctl_t);
- size += (L1_CACHE_BYTES-1);
- size &= ~(L1_CACHE_BYTES-1);
- cache_cache.c_offset = size-sizeof(kmem_bufctl_t);
-
- i = (PAGE_SIZE<<cache_cache.c_gfporder)-slab_align_size;
- cache_cache.c_num = i / size; /* num of objs per slab */
-
- /* Cache colouring. */
- cache_cache.c_colour = (i-(cache_cache.c_num*size))/L1_CACHE_BYTES;
- cache_cache.c_colour_next = cache_cache.c_colour;
-
- /*
- * Fragmentation resistance on low memory - only use bigger
- * page orders on machines with more than 32MB of memory.
- */
- if (num_physpages > (32 << 20) >> PAGE_SHIFT)
- slab_break_gfp_order = SLAB_BREAK_GFP_ORDER_HI;
- return start;
- }
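- 
- /* Worked example of the sizing above - all hypothetical figures, since
- * sizeof(kmem_cache_t) is arch and config dependent. Assuming PAGE_SIZE
- * 4096, L1_CACHE_BYTES 32, gfporder 0, and an obj size that rounds up
- * to 96 bytes:
- *   i        = 4096 - slab_align_size(32)  = 4064 bytes for objs
- *   c_num    = 4064/96                     = 42 objs per slab
- *   c_colour = (4064 - 42*96)/32           = 1 colour offset
- */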
-
- /* Initialisation - setup remaining internal and general caches.
- * Called after the gfp() functions have been enabled, and before smp_init().
- */
- void __init kmem_cache_sizes_init(void)
- {
- unsigned int found = 0;
-
- cache_slabp = kmem_cache_create("slab_cache", sizeof(kmem_slab_t),
- 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
- if (cache_slabp) {
- char **names = cache_sizes_name;
- cache_sizes_t *sizes = cache_sizes;
- do {
- /* For performance, all the general caches are L1 aligned.
- * This should be particularly beneficial on SMP boxes, as it
- * eliminates "false sharing".
- * Note: for systems short on memory, removing the alignment will
- * allow tighter packing of the smaller caches. */
- if (!(sizes->cs_cachep =
- kmem_cache_create(*names++, sizes->cs_size,
- 0, SLAB_HWCACHE_ALIGN, NULL, NULL)))
- goto panic_time;
- if (!found) {
- /* Inc off-slab bufctl limit until the ceiling is hit. */
- if (SLAB_BUFCTL(sizes->cs_cachep->c_flags))
- found++;
- else
- bufctl_limit =
- (sizes->cs_size/sizeof(kmem_bufctl_t));
- }
- sizes->cs_cachep->c_flags |= SLAB_CFLGS_GENERAL;
- sizes++;
- } while (sizes->cs_size);
- #if SLAB_SELFTEST
- kmem_self_test();
- #endif /* SLAB_SELFTEST */
- return;
- }
- panic_time:
- panic("kmem_cache_sizes_init: Error creating caches");
- /* NOTREACHED */
- }
-
- /* Interface to system's page allocator. On return, *dma is non-zero if all
- * of the memory is DMAable. No need to hold the cache-lock.
- */
- static inline void *
- kmem_getpages(kmem_cache_t *cachep, unsigned long flags, unsigned int *dma)
- {
- void *addr;
-
- *dma = flags & SLAB_DMA;
- addr = (void*) __get_free_pages(flags, cachep->c_gfporder);
- /* Assume that now we have the pages, no one else can legally
- * mess with the 'struct page's.
- * However vm_scan() might try to test the structure to see if
- * it is a named-page or buffer-page. The members it tests are
- * of no interest here.....
- */
- if (!*dma && addr) {
- /* Need to check if can dma. */
- struct page *page = mem_map + MAP_NR(addr);
- *dma = 1<<cachep->c_gfporder;
- while ((*dma)--) {
- if (!PageDMA(page)) {
- *dma = 0;
- break;
- }
- page++;
- }
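- /* Subtle: if the loop above tested every page, the final post-decrement
- * leaves *dma == ~0UL, which is non-zero - i.e. all pages are DMAable.
- */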
- }
- return addr;
- }
-
- /* Interface to system's page release. */
- static inline void
- kmem_freepages(kmem_cache_t *cachep, void *addr)
- {
- unsigned long i = (1<<cachep->c_gfporder);
- struct page *page = &mem_map[MAP_NR(addr)];
-
- /* free_pages() does not clear the type bit - we do that.
- * The pages have been unlinked from their cache-slab,
- * but their 'struct page's might be accessed in
- * vm_scan(). Shouldn't be a worry.
- */
- while (i--) {
- PageClearSlab(page);
- page++;
- }
- free_pages((unsigned long)addr, cachep->c_gfporder);
- }
-
- #if SLAB_DEBUG_SUPPORT
- static inline void
- kmem_poison_obj(kmem_cache_t *cachep, void *addr)
- {
- memset(addr, SLAB_POISON_BYTE, cachep->c_org_size);
- *(unsigned char *)(addr+cachep->c_org_size-1) = SLAB_POISON_END;
- }
-
- static inline int
- kmem_check_poison_obj(kmem_cache_t *cachep, void *addr)
- {
- void *end;
- end = memchr(addr, SLAB_POISON_END, cachep->c_org_size);
- if (end != (addr+cachep->c_org_size-1))
- return 1;
- return 0;
- }
- #endif /* SLAB_DEBUG_SUPPORT */
-
- /* Three slab chain funcs - all called with ints disabled and the appropriate
- * cache-lock held.
- */
- static inline void
- kmem_slab_unlink(kmem_slab_t *slabp)
- {
- kmem_slab_t *prevp = slabp->s_prevp;
- kmem_slab_t *nextp = slabp->s_nextp;
- prevp->s_nextp = nextp;
- nextp->s_prevp = prevp;
- }
-
- static inline void
- kmem_slab_link_end(kmem_cache_t *cachep, kmem_slab_t *slabp)
- {
- kmem_slab_t *lastp = cachep->c_lastp;
- slabp->s_nextp = kmem_slab_end(cachep);
- slabp->s_prevp = lastp;
- cachep->c_lastp = slabp;
- lastp->s_nextp = slabp;
- }
-
- static inline void
- kmem_slab_link_free(kmem_cache_t *cachep, kmem_slab_t *slabp)
- {
- kmem_slab_t *nextp = cachep->c_freep;
- kmem_slab_t *prevp = nextp->s_prevp;
- slabp->s_nextp = nextp;
- slabp->s_prevp = prevp;
- nextp->s_prevp = slabp;
- slabp->s_prevp->s_nextp = slabp;
- }
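- 
- /* The chain these helpers maintain is circular (closed by the sentinel
- * kmem_slab_end(cachep)) and partially ordered:
- *
- *   c_firstp -> [fully used] ... [partial] ... [fully free] <- c_lastp
- *                               ^c_freep
- *
- * kmem_slab_link_free() inserts just in front of c_freep;
- * kmem_slab_link_end() appends at c_lastp.
- */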
-
- /* Destroy all the objs in a slab, and release the mem back to the system.
- * Before calling, the slab must have been unlinked from the cache.
- * The cache-lock is not held/needed.
- */
- static void
- kmem_slab_destroy(kmem_cache_t *cachep, kmem_slab_t *slabp)
- {
- if (cachep->c_dtor
- #if SLAB_DEBUG_SUPPORT
- || cachep->c_flags & (SLAB_POISON | SLAB_RED_ZONE)
- #endif /*SLAB_DEBUG_SUPPORT*/
- ) {
- /* Doesn't use the bufctl ptrs to find objs. */
- unsigned long num = cachep->c_num;
- void *objp = slabp->s_mem;
- do {
- #if SLAB_DEBUG_SUPPORT
- if (cachep->c_flags & SLAB_RED_ZONE) {
- if (*((unsigned long*)(objp)) != SLAB_RED_MAGIC1)
- printk(KERN_ERR "kmem_slab_destroy: "
- "Bad front redzone - %s\n",
- cachep->c_name);
- objp += BYTES_PER_WORD;
- if (*((unsigned long*)(objp+cachep->c_org_size)) !=
- SLAB_RED_MAGIC1)
- printk(KERN_ERR "kmem_slab_destroy: "
- "Bad rear redzone - %s\n",
- cachep->c_name);
- }
- if (cachep->c_dtor)
- #endif /*SLAB_DEBUG_SUPPORT*/
- (cachep->c_dtor)(objp, cachep, 0);
- #if SLAB_DEBUG_SUPPORT
- else if (cachep->c_flags & SLAB_POISON) {
- if (kmem_check_poison_obj(cachep, objp))
- printk(KERN_ERR "kmem_slab_destroy: "
- "Bad poison - %s\n", cachep->c_name);
- }
- if (cachep->c_flags & SLAB_RED_ZONE)
- objp -= BYTES_PER_WORD;
- #endif /* SLAB_DEBUG_SUPPORT */
- objp += cachep->c_offset;
- if (!slabp->s_index)
- objp += sizeof(kmem_bufctl_t);
- } while (--num);
- }
-
- slabp->s_magic = SLAB_MAGIC_DESTROYED;
- if (slabp->s_index)
- kmem_cache_free(cachep->c_index_cachep, slabp->s_index);
- kmem_freepages(cachep, slabp->s_mem-slabp->s_offset);
- if (SLAB_OFF_SLAB(cachep->c_flags))
- kmem_cache_free(cache_slabp, slabp);
- }
-
- /* Calc the num objs, wastage, and bytes left over for a given slab size. */
- static inline size_t
- kmem_cache_cal_waste(unsigned long gfporder, size_t size, size_t extra,
- unsigned long flags, size_t *left_over, unsigned long *num)
- {
- size_t wastage = PAGE_SIZE<<gfporder;
-
- if (SLAB_OFF_SLAB(flags))
- gfporder = 0;
- else
- gfporder = slab_align_size;
- wastage -= gfporder;
- *num = wastage / size;
- wastage -= (*num * size);
- *left_over = wastage;
-
- return (wastage + gfporder + (extra * *num));
- }
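- 
- /* Example of the accounting above (hypothetical: 4096-byte pages, 32-byte
- * cache lines, on-slab management, size 128 incl. its bufctl, extra 4):
- *   wastage    = 4096 - slab_align_size(32) = 4064
- *   *num       = 4064/128                   = 31 objs
- *   *left_over = 4064 - 31*128              = 96 bytes (for colouring)
- *   returned   = 96 + 32 + 31*4             = 252 bytes of mgmt + waste
- */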
-
- /* Create a cache:
- * Returns a ptr to the cache on success, NULL on failure.
- * Cannot be called within an int, but can be interrupted.
- * NOTE: The 'name' is assumed to be memory that is _not_ going to disappear.
- */
- kmem_cache_t *
- kmem_cache_create(const char *name, size_t size, size_t offset,
- unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long),
- void (*dtor)(void*, kmem_cache_t *, unsigned long))
- {
- const char *func_nm= KERN_ERR "kmem_create: ";
- kmem_cache_t *searchp;
- kmem_cache_t *cachep=NULL;
- size_t extra;
- size_t left_over;
- size_t align;
-
- /* Sanity checks... */
- #if SLAB_MGMT_CHECKS
- if (!name) {
- printk("%sNULL ptr\n", func_nm);
- goto opps;
- }
- if (in_interrupt()) {
- printk("%sCalled during int - %s\n", func_nm, name);
- goto opps;
- }
-
- if (size < BYTES_PER_WORD) {
- printk("%sSize too small %d - %s\n", func_nm, (int) size, name);
- size = BYTES_PER_WORD;
- }
-
- if (size > ((1<<SLAB_OBJ_MAX_ORDER)*PAGE_SIZE)) {
- printk("%sSize too large %d - %s\n", func_nm, (int) size, name);
- goto opps;
- }
-
- if (dtor && !ctor) {
- /* Decon, but no con - doesn't make sense */
- printk("%sDecon but no con - %s\n", func_nm, name);
- goto opps;
- }
-
- if (offset > size) { /* offset is unsigned, so can never be < 0 */
- printk("%sOffset weird %d - %s\n", func_nm, (int) offset, name);
- offset = 0;
- }
-
- #if SLAB_DEBUG_SUPPORT
- if ((flags & SLAB_DEBUG_INITIAL) && !ctor) {
- /* No constructor, but initial state check requested */
- printk("%sNo con, but init state check requested - %s\n", func_nm, name);
- flags &= ~SLAB_DEBUG_INITIAL;
- }
-
- if ((flags & SLAB_POISON) && ctor) {
- /* request for poisoning, but we can't do that with a constructor */
- printk("%sPoisoning requested, but con given - %s\n", func_nm, name);
- flags &= ~SLAB_POISON;
- }
- #if 0
- if ((flags & SLAB_HIGH_PACK) && ctor) {
- printk("%sHigh pack requested, but con given - %s\n", func_nm, name);
- flags &= ~SLAB_HIGH_PACK;
- }
- if ((flags & SLAB_HIGH_PACK) && (flags & (SLAB_POISON|SLAB_RED_ZONE))) {
- printk("%sHigh pack requested, but with poisoning/red-zoning - %s\n",
- func_nm, name);
- flags &= ~SLAB_HIGH_PACK;
- }
- #endif
- #endif /* SLAB_DEBUG_SUPPORT */
- #endif /* SLAB_MGMT_CHECKS */
-
- /* Always check flags, as a caller might be expecting debug
- * support which isn't available.
- */
- if (flags & ~SLAB_C_MASK) {
- printk("%sIllgl flg %lX - %s\n", func_nm, flags, name);
- flags &= SLAB_C_MASK;
- }
-
- /* Get cache's description obj. */
- cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL);
- if (!cachep)
- goto opps;
- memset(cachep, 0, sizeof(kmem_cache_t));
-
- /* Check that size is in terms of words. This is needed to avoid
- * unaligned accesses for some archs when redzoning is used, and makes
- * sure any on-slab bufctl's are also correctly aligned.
- */
- if (size & (BYTES_PER_WORD-1)) {
- size += (BYTES_PER_WORD-1);
- size &= ~(BYTES_PER_WORD-1);
- printk("%sForcing size word alignment - %s\n", func_nm, name);
- }
-
- cachep->c_org_size = size;
- #if SLAB_DEBUG_SUPPORT
- if (flags & SLAB_RED_ZONE) {
- /* There is no point trying to honour cache alignment when redzoning. */
- flags &= ~SLAB_HWCACHE_ALIGN;
- size += 2*BYTES_PER_WORD; /* words for redzone */
- }
- #endif /* SLAB_DEBUG_SUPPORT */
-
- align = BYTES_PER_WORD;
- if (flags & SLAB_HWCACHE_ALIGN)
- align = L1_CACHE_BYTES;
-
- /* Determine if the slab management and/or bufctls are 'on' or 'off' slab. */
- extra = sizeof(kmem_bufctl_t);
- if (size < (PAGE_SIZE>>3)) {
- /* Size is small(ish). Use packing where bufctl size per
- * obj is low, and slab management is on-slab.
- */
- #if 0
- if ((flags & SLAB_HIGH_PACK)) {
- /* Special high packing for small objects
- * (mainly for vm_mapping structs, but
- * others can use it).
- */
- if (size == (L1_CACHE_BYTES/4) || size == (L1_CACHE_BYTES/2) ||
- size == L1_CACHE_BYTES) {
- /* The bufctl is stored with the object. */
- extra = 0;
- } else
- flags &= ~SLAB_HIGH_PACK;
- }
- #endif
- } else {
- /* Size is large, assume best to place the slab management obj
- * off-slab (should allow better packing of objs).
- */
- flags |= SLAB_CFLGS_OFF_SLAB;
- if (!(size & ~PAGE_MASK) || size == (PAGE_SIZE/2)
- || size == (PAGE_SIZE/4) || size == (PAGE_SIZE/8)) {
- /* To avoid waste the bufctls are off-slab... */
- flags |= SLAB_CFLGS_BUFCTL;
- extra = 0;
- } /* else slab management is off-slab, but freelist pointers are on. */
- }
- size += extra;
-
- if (flags & SLAB_HWCACHE_ALIGN) {
- /* Need to adjust size so that objs are cache aligned. */
- if (size > (L1_CACHE_BYTES/2)) {
- size_t words = size % L1_CACHE_BYTES;
- if (words)
- size += (L1_CACHE_BYTES-words);
- } else {
- /* Small obj size, can get at least two per cache line. */
- int num_per_line = L1_CACHE_BYTES/size;
- left_over = L1_CACHE_BYTES - (num_per_line*size);
- if (left_over) {
- /* Need to adjust size so objs cache align. */
- if (left_over%num_per_line) {
- /* Odd num of objs per line - fixup. */
- num_per_line--;
- left_over += size;
- }
- size += (left_over/num_per_line);
- }
- }
- } else if (!(size%L1_CACHE_BYTES)) {
- /* Size happens to cache align... */
- flags |= SLAB_HWCACHE_ALIGN;
- align = L1_CACHE_BYTES;
- }
-
- /* Calc size (in pages) of slabs, and the num of objs per slab.
- * This could be made much more intelligent. For now, try to avoid
- * using high page-orders for slabs. When the gfp() funcs are more
- * friendly towards high-order requests, this should be changed.
- */
- do {
- size_t wastage;
- unsigned int break_flag = 0;
- cal_wastage:
- wastage = kmem_cache_cal_waste(cachep->c_gfporder, size, extra,
- flags, &left_over, &cachep->c_num);
- if (!cachep->c_num)
- goto next;
- if (break_flag)
- break;
- if (SLAB_BUFCTL(flags) && cachep->c_num > bufctl_limit) {
- /* Oops, this num of objs will cause problems. */
- cachep->c_gfporder--;
- break_flag++;
- goto cal_wastage;
- }
- if (cachep->c_gfporder == SLAB_MAX_GFP_ORDER)
- break;
-
- /* Large num of objs is good, but v. large slabs are currently
- * bad for the gfp()s.
- */
- if (cachep->c_num <= SLAB_MIN_OBJS_PER_SLAB) {
- if (cachep->c_gfporder < slab_break_gfp_order)
- goto next;
- }
-
- /* Stop caches with small objs having a large num of pages. */
- if (left_over <= slab_align_size)
- break;
- if ((wastage*8) <= (PAGE_SIZE<<cachep->c_gfporder))
- break; /* Acceptable internal fragmentation. */
- next:
- cachep->c_gfporder++;
- } while (1);
-
- /* If the slab management has been placed off-slab, and we have enough space
- * then move it on-slab. This is at the expense of any extra colouring.
- */
- if ((flags & SLAB_CFLGS_OFF_SLAB) && !SLAB_BUFCTL(flags) &&
- left_over >= slab_align_size) {
- flags &= ~SLAB_CFLGS_OFF_SLAB;
- left_over -= slab_align_size;
- }
-
- /* Offset must be a multiple of the alignment. */
- offset += (align-1);
- offset &= ~(align-1);
-
- /* Mess around with the offset alignment. */
- if (!left_over) {
- offset = 0;
- } else if (left_over < offset) {
- offset = align;
- if (flags & SLAB_HWCACHE_ALIGN) {
- if (left_over < offset)
- offset = 0;
- } else {
- /* Offset is BYTES_PER_WORD, and left_over is at
- * least BYTES_PER_WORD.
- */
- if (left_over >= (BYTES_PER_WORD*2)) {
- offset >>= 1;
- if (left_over >= (BYTES_PER_WORD*4))
- offset >>= 1;
- }
- }
- } else if (!offset) {
- /* No offset requested, but space enough - give one. */
- offset = left_over/align;
- if (flags & SLAB_HWCACHE_ALIGN) {
- if (offset >= 8) {
- /* A large number of colours - use a larger alignment. */
- align <<= 1;
- }
- } else {
- if (offset >= 10) {
- align <<= 1;
- if (offset >= 16)
- align <<= 1;
- }
- }
- offset = align;
- }
-
- #if 0
- printk("%s: Left_over:%d Align:%d Size:%d\n", name, left_over, offset, size);
- #endif
-
- if ((cachep->c_align = (unsigned long) offset))
- cachep->c_colour = (left_over/offset);
- cachep->c_colour_next = cachep->c_colour;
-
- /* If the bufctl's are on-slab, c_offset does not include the size of bufctl. */
- if (!SLAB_BUFCTL(flags))
- size -= sizeof(kmem_bufctl_t);
- else
- cachep->c_index_cachep =
- kmem_find_general_cachep(cachep->c_num*sizeof(kmem_bufctl_t));
- cachep->c_offset = (unsigned long) size;
- cachep->c_freep = kmem_slab_end(cachep);
- cachep->c_firstp = kmem_slab_end(cachep);
- cachep->c_lastp = kmem_slab_end(cachep);
- cachep->c_flags = flags;
- cachep->c_ctor = ctor;
- cachep->c_dtor = dtor;
- cachep->c_magic = SLAB_C_MAGIC;
- cachep->c_name = name; /* Simply point to the name. */
- spin_lock_init(&cachep->c_spinlock);
-
- /* Need the semaphore to access the chain. */
- down(&cache_chain_sem);
- searchp = &cache_cache;
- do {
- /* The name field is constant - no lock needed. */
- if (!strcmp(searchp->c_name, name)) {
- printk("%sDup name - %s\n", func_nm, name);
- break;
- }
- searchp = searchp->c_nextp;
- } while (searchp != &cache_cache);
-
- /* There is no reason to lock our new cache before we
- * link it in - no one knows about it yet...
- */
- cachep->c_nextp = cache_cache.c_nextp;
- cache_cache.c_nextp = cachep;
- up(&cache_chain_sem);
- opps:
- return cachep;
- }
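- 
- #if 0
- /* Typical usage sketch - hypothetical client code, not part of this file.
- * A sub-system creates its cache once, then allocs/frees objs from it.
- */
- struct foo { int bar; };
- static kmem_cache_t *foo_cachep;
- 
- static void foo_ctor(void *objp, kmem_cache_t *cachep, unsigned long flags)
- {
- /* Called without the cache-lock held; must be threaded, and must not
- * allocate from foo_cachep itself (see the header comment).
- */
- ((struct foo *)objp)->bar = 0;
- }
- 
- void foo_init(void)
- {
- foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
- 0, SLAB_HWCACHE_ALIGN, foo_ctor, NULL);
- }
- #endif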
-
- /* Shrink a cache. Releases as many slabs as possible for a cache.
- * It is expected this function will be called by a module when it is
- * unloaded. The cache is _not_ removed - that creates too many problems, and
- * the cache-structure does not take up much room. A module should keep its
- * cache pointer(s) where they survive an unload, so that when reloaded it
- * knows the cache is still available. To help debugging, a zero exit status
- * indicates all slabs were released.
- */
- int
- kmem_cache_shrink(kmem_cache_t *cachep)
- {
- kmem_cache_t *searchp;
- kmem_slab_t *slabp;
- int ret;
-
- if (!cachep) {
- printk(KERN_ERR "kmem_shrink: NULL ptr\n");
- return 2;
- }
- if (in_interrupt()) {
- printk(KERN_ERR "kmem_shrink: Called during int - %s\n", cachep->c_name);
- return 2;
- }
-
- /* Find the cache in the chain of caches. */
- down(&cache_chain_sem); /* Semaphore is needed. */
- searchp = &cache_cache;
- for (;searchp->c_nextp != &cache_cache; searchp = searchp->c_nextp) {
- if (searchp->c_nextp != cachep)
- continue;
-
- /* Accessing clock_searchp is safe - we hold the mutex. */
- if (cachep == clock_searchp)
- clock_searchp = cachep->c_nextp;
- goto found;
- }
- up(&cache_chain_sem);
- printk(KERN_ERR "kmem_shrink: Invalid cache addr %p\n", cachep);
- return 2;
- found:
- /* Release the semaphore before getting the cache-lock. This could
- * mean multiple engines are shrinking the cache, but so what.
- */
- up(&cache_chain_sem);
- spin_lock_irq(&cachep->c_spinlock);
-
- /* If the cache is growing, stop shrinking. */
- while (!cachep->c_growing) {
- slabp = cachep->c_lastp;
- if (slabp->s_inuse || slabp == kmem_slab_end(cachep))
- break;
- kmem_slab_unlink(slabp);
- spin_unlock_irq(&cachep->c_spinlock);
- kmem_slab_destroy(cachep, slabp);
- spin_lock_irq(&cachep->c_spinlock);
- }
- ret = 1;
- if (cachep->c_lastp == kmem_slab_end(cachep))
- ret--; /* Cache is empty. */
- spin_unlock_irq(&cachep->c_spinlock);
- return ret;
- }
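- 
- /* Usage sketch (hypothetical, following the foo example above): a module's
- * cleanup path shrinks its cache; a non-zero return means some slabs could
- * not be released - usually a sign of leaked objs.
- *
- *   void cleanup_module(void)
- *   {
- *           if (kmem_cache_shrink(foo_cachep))
- *                   printk(KERN_WARNING "foo: objs still active?\n");
- *   }
- */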
-
- /* Get the memory for a slab management obj. */
- static inline kmem_slab_t *
- kmem_cache_slabmgmt(kmem_cache_t *cachep, void *objp, int local_flags)
- {
- kmem_slab_t *slabp;
-
- if (SLAB_OFF_SLAB(cachep->c_flags)) {
- /* Slab management obj is off-slab. */
- slabp = kmem_cache_alloc(cache_slabp, local_flags);
- } else {
- /* Slab management at end of slab memory, placed so that
- * the position is 'coloured'.
- */
- void *end;
- end = objp + (cachep->c_num * cachep->c_offset);
- if (!SLAB_BUFCTL(cachep->c_flags))
- end += (cachep->c_num * sizeof(kmem_bufctl_t));
- slabp = (kmem_slab_t *) L1_CACHE_ALIGN((unsigned long)end);
- }
-
- if (slabp) {
- slabp->s_inuse = 0;
- slabp->s_dma = 0;
- slabp->s_index = NULL;
- }
-
- return slabp;
- }
-
- static inline void
- kmem_cache_init_objs(kmem_cache_t * cachep, kmem_slab_t * slabp, void *objp,
- unsigned long ctor_flags)
- {
- kmem_bufctl_t **bufpp = &slabp->s_freep;
- unsigned long num = cachep->c_num-1;
-
- do {
- #if SLAB_DEBUG_SUPPORT
- if (cachep->c_flags & SLAB_RED_ZONE) {
- *((unsigned long*)(objp)) = SLAB_RED_MAGIC1;
- objp += BYTES_PER_WORD;
- *((unsigned long*)(objp+cachep->c_org_size)) = SLAB_RED_MAGIC1;
- }
- #endif /* SLAB_DEBUG_SUPPORT */
-
- /* Constructors are not allowed to allocate memory from the same cache
- * which they are a constructor for. Otherwise, deadlock.
- * They must also be threaded.
- */
- if (cachep->c_ctor)
- cachep->c_ctor(objp, cachep, ctor_flags);
- #if SLAB_DEBUG_SUPPORT
- else if (cachep->c_flags & SLAB_POISON) {
- /* need to poison the objs */
- kmem_poison_obj(cachep, objp);
- }
-
- if (cachep->c_flags & SLAB_RED_ZONE) {
- if (*((unsigned long*)(objp+cachep->c_org_size)) !=
- SLAB_RED_MAGIC1) {
- *((unsigned long*)(objp+cachep->c_org_size)) =
- SLAB_RED_MAGIC1;
- printk(KERN_ERR "kmem_init_obj: Bad rear redzone "
- "after constructor - %s\n", cachep->c_name);
- }
- objp -= BYTES_PER_WORD;
- if (*((unsigned long*)(objp)) != SLAB_RED_MAGIC1) {
- *((unsigned long*)(objp)) = SLAB_RED_MAGIC1;
- printk(KERN_ERR "kmem_init_obj: Bad front redzone "
- "after constructor - %s\n", cachep->c_name);
- }
- }
- #endif /* SLAB_DEBUG_SUPPORT */
-
- objp += cachep->c_offset;
- if (!slabp->s_index) {
- *bufpp = objp;
- objp += sizeof(kmem_bufctl_t);
- } else
- *bufpp = &slabp->s_index[num];
- bufpp = &(*bufpp)->buf_nextp;
- } while (num--);
-
- *bufpp = NULL;
- }
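- 
- /* Note on the loop above: bufpp always points at the previous bufctl's
- * buf_nextp field (starting at s_freep), so the objs end up threaded onto
- * the slab's freelist in address order, terminated by the final NULL store.
- */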
-
- /* Grow (by 1) the number of slabs within a cache. This is called by
- * kmem_cache_alloc() when there are no inactive (free) objs left in a cache.
- */
- static int
- kmem_cache_grow(kmem_cache_t * cachep, int flags)
- {
- kmem_slab_t *slabp;
- struct page *page;
- void *objp;
- size_t offset;
- unsigned int dma, local_flags;
- unsigned long ctor_flags;
- unsigned long save_flags;
-
- /* Be lazy and only check for valid flags here,
- * keeping it out of the critical path in kmem_cache_alloc().
- */
- if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW)) {
- printk(KERN_WARNING "kmem_grow: Illegal flgs %X (correcting) - %s\n",
- flags, cachep->c_name);
- flags &= (SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW);
- }
-
- if (flags & SLAB_NO_GROW)
- return 0;
-
- /* The test for missing atomic flag is performed here, rather than
- * the more obvious place, simply to reduce the critical path length
- * in kmem_cache_alloc(). If a caller is slightly mis-behaving they
- * will eventually be caught here (where it matters).
- */
- if (in_interrupt() && (flags & SLAB_LEVEL_MASK) != SLAB_ATOMIC) {
- printk(KERN_ERR "kmem_grow: Called nonatomically from int - %s\n",
- cachep->c_name);
- flags &= ~SLAB_LEVEL_MASK;
- flags |= SLAB_ATOMIC;
- }
- ctor_flags = SLAB_CTOR_CONSTRUCTOR;
- local_flags = (flags & SLAB_LEVEL_MASK);
- if (local_flags == SLAB_ATOMIC) {
- /* Not allowed to sleep. Need to tell a constructor about
- * this - it might need to know...
- */
- ctor_flags |= SLAB_CTOR_ATOMIC;
- }
-
- /* About to mess with non-constant members - lock. */
- spin_lock_irqsave(&cachep->c_spinlock, save_flags);
-
- /* Get colour for the slab, and calc the next value. */
- if (!(offset = cachep->c_colour_next--))
- cachep->c_colour_next = cachep->c_colour;
- offset *= cachep->c_align;
- cachep->c_dflags = SLAB_CFLGS_GROWN;
-
- cachep->c_growing++;
- spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
-
- /* A series of memory allocations for a new slab.
- * Neither the cache-chain semaphore, nor the cache-lock, is
- * held, but the incremented c_growing prevents this
- * cache from being reaped or shrunk.
- * Note: The cache could be selected for reaping in
- * kmem_cache_reap(), but when the final test is made the
- * growing value will be seen.
- */
-
- /* Get mem for the objs. */
- if (!(objp = kmem_getpages(cachep, flags, &dma)))
- goto failed;
-
- /* Get slab management. */
- if (!(slabp = kmem_cache_slabmgmt(cachep, objp+offset, local_flags)))
- goto opps1;
- if (dma)
- slabp->s_dma = 1;
- if (SLAB_BUFCTL(cachep->c_flags)) {
- slabp->s_index = kmem_cache_alloc(cachep->c_index_cachep, local_flags);
- if (!slabp->s_index)
- goto opps2;
- }
-
- /* Nasty!!!!!! I hope this is OK. */
- dma = 1 << cachep->c_gfporder;
- page = &mem_map[MAP_NR(objp)];
- do {
- SLAB_SET_PAGE_CACHE(page, cachep);
- SLAB_SET_PAGE_SLAB(page, slabp);
- PageSetSlab(page);
- page++;
- } while (--dma);
-
- slabp->s_offset = offset; /* It will fit... */
- objp += offset; /* Address of first object. */
- slabp->s_mem = objp;
-
- /* For on-slab bufctls, c_offset is the distance between the start of
- * an obj and its related bufctl. For off-slab bufctls, c_offset is
- * the distance between objs in the slab.
- */
- kmem_cache_init_objs(cachep, slabp, objp, ctor_flags);
-
- spin_lock_irq(&cachep->c_spinlock);
-
- /* Make slab active. */
- slabp->s_magic = SLAB_MAGIC_ALLOC;
- kmem_slab_link_end(cachep, slabp);
- if (cachep->c_freep == kmem_slab_end(cachep))
- cachep->c_freep = slabp;
- SLAB_STATS_INC_GROWN(cachep);
- cachep->c_failures = 0;
- cachep->c_growing--;
-
- spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
- return 1;
- opps2:
- if (SLAB_OFF_SLAB(cachep->c_flags))
- kmem_cache_free(cache_slabp, slabp);
- opps1:
- kmem_freepages(cachep, objp);
- failed:
- spin_lock_irq(&cachep->c_spinlock);
- cachep->c_growing--;
- spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
- return 0;
- }
-
- static void
- kmem_report_alloc_err(const char *str, kmem_cache_t * cachep)
- {
- if (cachep)
- SLAB_STATS_INC_ERR(cachep); /* this is atomic */
- printk(KERN_ERR "kmem_alloc: %s (name=%s)\n",
- str, cachep ? cachep->c_name : "unknown");
- }
-
- static void
- kmem_report_free_err(const char *str, const void *objp, kmem_cache_t * cachep)
- {
- if (cachep)
- SLAB_STATS_INC_ERR(cachep);
- printk(KERN_ERR "kmem_free: %s (objp=%p, name=%s)\n",
- str, objp, cachep ? cachep->c_name : "unknown");
- }
-
- /* Search for a slab whose objs are suitable for DMA.
- * Note: since testing the first free slab (in __kmem_cache_alloc()),
- * ints must not have been enabled, or the cache-lock released!
- */
- static inline kmem_slab_t *
- kmem_cache_search_dma(kmem_cache_t * cachep)
- {
- kmem_slab_t *slabp = cachep->c_freep->s_nextp;
-
- for (; slabp != kmem_slab_end(cachep); slabp = slabp->s_nextp) {
- if (!(slabp->s_dma))
- continue;
- kmem_slab_unlink(slabp);
- kmem_slab_link_free(cachep, slabp);
- cachep->c_freep = slabp;
- break;
- }
- return slabp;
- }
-
- #if SLAB_DEBUG_SUPPORT
- /* Perform extra freeing checks. Currently, this check is only for caches
- * that use bufctl structures within the slab. Those which use bufctl's
- * from the internal cache have a reasonable check when the address is
- * searched for. Called with the cache-lock held.
- */
- static void *
- kmem_extra_free_checks(kmem_cache_t * cachep, kmem_bufctl_t *search_bufp,
- kmem_bufctl_t *bufp, void * objp)
- {
- if (SLAB_BUFCTL(cachep->c_flags))
- return objp;
-
- /* Check slab's freelist to see if this obj is there. */
- for (; search_bufp; search_bufp = search_bufp->buf_nextp) {
- if (search_bufp != bufp)
- continue;
- return NULL;
- }
- return objp;
- }
- #endif /* SLAB_DEBUG_SUPPORT */
-
- /* Called with cache lock held. */
- static inline void
- kmem_cache_full_free(kmem_cache_t *cachep, kmem_slab_t *slabp)
- {
- if (slabp->s_nextp->s_inuse) {
- /* Not at correct position. */
- if (cachep->c_freep == slabp)
- cachep->c_freep = slabp->s_nextp;
- kmem_slab_unlink(slabp);
- kmem_slab_link_end(cachep, slabp);
- }
- }
-
- /* Called with cache lock held. */
- static inline void
- kmem_cache_one_free(kmem_cache_t *cachep, kmem_slab_t *slabp)
- {
- if (slabp->s_nextp->s_inuse == cachep->c_num) {
- kmem_slab_unlink(slabp);
- kmem_slab_link_free(cachep, slabp);
- }
- cachep->c_freep = slabp;
- }
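- 
- /* Between them, these two helpers keep the slab chain partially ordered
- * as objs are released: a slab that becomes fully free drifts towards
- * c_lastp (kmem_cache_full_free), while a previously full slab that gains
- * its first free obj becomes the new c_freep (kmem_cache_one_free).
- */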
-
- /* Returns a ptr to an obj in the given cache. */
- static inline void *
- __kmem_cache_alloc(kmem_cache_t *cachep, int flags)
- {
- kmem_slab_t *slabp;
- kmem_bufctl_t *bufp;
- void *objp;
- unsigned long save_flags;
-
- /* Sanity check. */
- if (!cachep)
- goto nul_ptr;
- spin_lock_irqsave(&cachep->c_spinlock, save_flags);
- try_again:
- /* Get the slab the alloc is to come from. */
- slabp = cachep->c_freep;
-
- /* Magic is a sanity check _and_ says if we need a new slab. */
- if (slabp->s_magic != SLAB_MAGIC_ALLOC)
- goto alloc_new_slab;
- /* DMA requests are 'rare' - keep out of the critical path. */
- if (flags & SLAB_DMA)
- goto search_dma;
- try_again_dma:
- SLAB_STATS_INC_ALLOCED(cachep);
- SLAB_STATS_INC_ACTIVE(cachep);
- SLAB_STATS_SET_HIGH(cachep);
- slabp->s_inuse++;
- bufp = slabp->s_freep;
- slabp->s_freep = bufp->buf_nextp;
- if (slabp->s_freep) {
- ret_obj:
- if (!slabp->s_index) {
- bufp->buf_slabp = slabp;
- objp = ((void*)bufp) - cachep->c_offset;
- finished:
- /* The lock is not needed by the red-zone or poison ops, and the
- * obj has been removed from the slab. Should be safe to drop
- * the lock here.
- */
- spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
- #if SLAB_DEBUG_SUPPORT
- if (cachep->c_flags & SLAB_RED_ZONE)
- goto red_zone;
- ret_red:
- if ((cachep->c_flags & SLAB_POISON) && kmem_check_poison_obj(cachep, objp))
- kmem_report_alloc_err("Bad poison", cachep);
- #endif /* SLAB_DEBUG_SUPPORT */
- return objp;
- }
- /* Update index ptr. */
- objp = ((bufp-slabp->s_index)*cachep->c_offset) + slabp->s_mem;
- bufp->buf_objp = objp;
- goto finished;
- }
- cachep->c_freep = slabp->s_nextp;
- goto ret_obj;
-
- #if SLAB_DEBUG_SUPPORT
- red_zone:
- /* Set alloc red-zone, and check old one. */
- if (xchg((unsigned long *)objp, SLAB_RED_MAGIC2) != SLAB_RED_MAGIC1)
- kmem_report_alloc_err("Bad front redzone", cachep);
- objp += BYTES_PER_WORD;
- if (xchg((unsigned long *)(objp+cachep->c_org_size), SLAB_RED_MAGIC2) != SLAB_RED_MAGIC1)
- kmem_report_alloc_err("Bad rear redzone", cachep);
- goto ret_red;
- #endif /* SLAB_DEBUG_SUPPORT */
-
- search_dma:
- if (slabp->s_dma || (slabp = kmem_cache_search_dma(cachep))!=kmem_slab_end(cachep))
- goto try_again_dma;
- alloc_new_slab:
- /* Either out of slabs, or magic number corruption. */
- if (slabp == kmem_slab_end(cachep)) {
- /* Need a new slab. Release the lock before calling kmem_cache_grow().
- * This allows objs to be released back into the cache while growing.
- */
- spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
- if (kmem_cache_grow(cachep, flags)) {
- /* Someone may have stolen our objs. Doesn't matter, we'll
- * just come back here again.
- */
- spin_lock_irq(&cachep->c_spinlock);
- goto try_again;
- }
- /* Couldn't grow, but some objs may have been freed. */
- spin_lock_irq(&cachep->c_spinlock);
- if (cachep->c_freep != kmem_slab_end(cachep)) {
- if ((flags & SLAB_ATOMIC) == 0)
- goto try_again;
- }
- } else {
- /* Very serious error - maybe panic() here? */
- kmem_report_alloc_err("Bad slab magic (corrupt)", cachep);
- }
- spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
- err_exit:
- return NULL;
- nul_ptr:
- kmem_report_alloc_err("NULL ptr", NULL);
- goto err_exit;
- }
-
- /* Release an obj back to its cache. If the obj has a constructed state,
- * it should be in this state _before_ it is released.
- */
- static inline void
- __kmem_cache_free(kmem_cache_t *cachep, const void *objp)
- {
- kmem_slab_t *slabp;
- kmem_bufctl_t *bufp;
- unsigned long save_flags;
-
- /* Basic sanity checks. */
- if (!cachep || !objp)
- goto null_addr;
-
- #if SLAB_DEBUG_SUPPORT
- /* A verify func is called without the cache-lock held. */
- if (cachep->c_flags & SLAB_DEBUG_INITIAL)
- goto init_state_check;
- finished_initial:
-
- if (cachep->c_flags & SLAB_RED_ZONE)
- goto red_zone;
- return_red:
- #endif /* SLAB_DEBUG_SUPPORT */
-
- spin_lock_irqsave(&cachep->c_spinlock, save_flags);
-
- if (SLAB_BUFCTL(cachep->c_flags))
- goto bufctl;
- bufp = (kmem_bufctl_t *)(objp+cachep->c_offset);
-
- /* Get slab for the object. */
- #if 0
- /* _NASTY_IF/ELSE_, but avoids a 'distant' memory ref for some objects.
- * Is this worth while? XXX
- */
- if (cachep->c_flags & SLAB_HIGH_PACK)
- slabp = SLAB_GET_PAGE_SLAB(&mem_map[MAP_NR(bufp)]);
- else
- #endif
- slabp = bufp->buf_slabp;
-
- check_magic:
- if (slabp->s_magic != SLAB_MAGIC_ALLOC) /* Sanity check. */
- goto bad_slab;
-
- #if SLAB_DEBUG_SUPPORT
- if (cachep->c_flags & SLAB_DEBUG_FREE)
- goto extra_checks;
- passed_extra:
- #endif /* SLAB_DEBUG_SUPPORT */
-
- if (slabp->s_inuse) { /* Sanity check. */
- SLAB_STATS_DEC_ACTIVE(cachep);
- slabp->s_inuse--;
- bufp->buf_nextp = slabp->s_freep;
- slabp->s_freep = bufp;
- if (bufp->buf_nextp) {
- if (slabp->s_inuse) {
- /* (hopefully) The most common case. */
- finished:
- #if SLAB_DEBUG_SUPPORT
- if (cachep->c_flags & SLAB_POISON) {
- if (cachep->c_flags & SLAB_RED_ZONE)
- objp += BYTES_PER_WORD;
- kmem_poison_obj(cachep, objp);
- }
- #endif /* SLAB_DEBUG_SUPPORT */
- spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
- return;
- }
- kmem_cache_full_free(cachep, slabp);
- goto finished;
- }
- kmem_cache_one_free(cachep, slabp);
- goto finished;
- }
-
- /* Don't add to freelist. */
- spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
- kmem_report_free_err("free with no active objs", objp, cachep);
- return;
- bufctl:
- /* No 'extra' checks are performed for objs stored this way, finding
- * the obj is check enough.
- */
- slabp = SLAB_GET_PAGE_SLAB(&mem_map[MAP_NR(objp)]);
- bufp = &slabp->s_index[(objp - slabp->s_mem)/cachep->c_offset];
- if (bufp->buf_objp == objp)
- goto check_magic;
- spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
- kmem_report_free_err("Either bad obj addr or double free", objp, cachep);
- return;
- #if SLAB_DEBUG_SUPPORT
- init_state_check:
- /* Need to call the slab's constructor so the
- * caller can perform a verify of its state (debugging).
- */
- cachep->c_ctor(objp, cachep, SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY);
- goto finished_initial;
- extra_checks:
- if (!kmem_extra_free_checks(cachep, slabp->s_freep, bufp, objp)) {
- spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
- kmem_report_free_err("Double free detected during checks", objp, cachep);
- return;
- }
- goto passed_extra;
- red_zone:
- /* We do not hold the cache-lock while checking the red-zone.
- */
- objp -= BYTES_PER_WORD;
- if (xchg((unsigned long *)objp, SLAB_RED_MAGIC1) != SLAB_RED_MAGIC2) {
- /* Either write before start of obj, or a double free. */
- kmem_report_free_err("Bad front redzone", objp, cachep);
- }
- if (xchg((unsigned long *)(objp+cachep->c_org_size+BYTES_PER_WORD), SLAB_RED_MAGIC1) != SLAB_RED_MAGIC2) {
- /* Either write past end of obj, or a double free. */
- kmem_report_free_err("Bad rear redzone", objp, cachep);
- }
- goto return_red;
- #endif /* SLAB_DEBUG_SUPPORT */
-
- bad_slab:
- /* Slab doesn't contain the correct magic num. */
- if (slabp->s_magic == SLAB_MAGIC_DESTROYED) {
- /* Magic num says this is a destroyed slab. */
- kmem_report_free_err("free from inactive slab", objp, cachep);
- } else
- kmem_report_free_err("Bad obj addr", objp, cachep);
- spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
-
- #if 1
- /* FORCE A KERNEL DUMP WHEN THIS HAPPENS. SPEAK IN ALL CAPS. GET THE CALL CHAIN. */
- *(int *) 0 = 0;
- #endif
-
- return;
- null_addr:
- kmem_report_free_err("NULL ptr", objp, cachep);
- return;
- }
-
- void *
- kmem_cache_alloc(kmem_cache_t *cachep, int flags)
- {
- return __kmem_cache_alloc(cachep, flags);
- }
-
- void
- kmem_cache_free(kmem_cache_t *cachep, void *objp)
- {
- __kmem_cache_free(cachep, objp);
- }
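- 
- /* Usage sketch (hypothetical, continuing the foo example above):
- *
- *   struct foo *fp = kmem_cache_alloc(foo_cachep, SLAB_KERNEL);
- *   if (fp) {
- *           ... use fp, restoring its constructed state ...
- *           kmem_cache_free(foo_cachep, fp);
- *   }
- *
- * Use SLAB_ATOMIC instead of SLAB_KERNEL where sleeping is not allowed
- * (e.g. in an interrupt handler).
- */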
-
- void *
- kmalloc(size_t size, int flags)
- {
- cache_sizes_t *csizep = cache_sizes;
-
- for (; csizep->cs_size; csizep++) {
- if (size > csizep->cs_size)
- continue;
- return __kmem_cache_alloc(csizep->cs_cachep, flags);
- }
- printk(KERN_ERR "kmalloc: Size (%lu) too large\n", (unsigned long) size);
- return NULL;
- }
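- 
- /* e.g. kmalloc(100, GFP_KERNEL) is served from the "size-128" general
- * cache; kfree() later recovers that cache from the obj's struct page.
- * Requests larger than the biggest entry in cache_sizes[] return NULL.
- */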
-
- void
- kfree(const void *objp)
- {
- struct page *page;
- int nr;
-
- if (!objp)
- goto null_ptr;
- nr = MAP_NR(objp);
- if (nr >= max_mapnr)
- goto bad_ptr;
-
- /* Assume we own the page structure - hence no locking.
- * If someone is misbehaving (for example, calling us with a bad
- * address), then access to the page structure can race with the
- * kmem_slab_destroy() code. Need to add a spin_lock to each page
- * structure, which would be useful in threading the gfp() functions....
- */
- page = &mem_map[nr];
- if (PageSlab(page)) {
- kmem_cache_t *cachep;
-
- /* Here, we again assume the obj address is good.
- * If it isn't, and happens to map onto another
- * general cache page which has no active objs, then
- * we race.
- */
- cachep = SLAB_GET_PAGE_CACHE(page);
- if (cachep && (cachep->c_flags & SLAB_CFLGS_GENERAL)) {
- __kmem_cache_free(cachep, objp);
- return;
- }
- }
- bad_ptr:
- printk(KERN_ERR "kfree: Bad obj %p\n", objp);
-
- #if 1
- /* FORCE A KERNEL DUMP WHEN THIS HAPPENS. SPEAK IN ALL CAPS. GET THE CALL CHAIN. */
- *(int *) 0 = 0;
- #endif
-
- null_ptr:
- return;
- }
-
- void
- kfree_s(const void *objp, size_t size)
- {
- struct page *page;
- int nr;
-
- if (!objp)
- goto null_ptr;
- nr = MAP_NR(objp);
- if (nr >= max_mapnr)
- goto null_ptr;
- /* See comment in kfree() */
- page = &mem_map[nr];
- if (PageSlab(page)) {
- kmem_cache_t *cachep;
- /* See comment in kfree() */
- cachep = SLAB_GET_PAGE_CACHE(page);
- if (cachep && cachep->c_flags & SLAB_CFLGS_GENERAL) {
- if (size <= cachep->c_org_size) { /* XXX better check */
- __kmem_cache_free(cachep, objp);
- return;
- }
- }
- }
- null_ptr:
- printk(KERN_ERR "kfree_s: Bad obj %p\n", objp);
- return;
- }
-
- kmem_cache_t *
- kmem_find_general_cachep(size_t size)
- {
- cache_sizes_t *csizep = cache_sizes;
-
- /* This function could be moved to the header file, and
- * made inline so consumers can quickly determine what
- * cache pointer they require.
- */
- for (; csizep->cs_size; csizep++) {
- if (size > csizep->cs_size)
- continue;
- break;
- }
- return csizep->cs_cachep;
- }
-
-
- /* Called from try_to_free_page().
- * This function _cannot_ be called within an int, but it
- * can be interrupted.
- */
- void
- kmem_cache_reap(int gfp_mask)
- {
- kmem_slab_t *slabp;
- kmem_cache_t *searchp;
- kmem_cache_t *best_cachep;
- unsigned int scan;
- unsigned int reap_level;
-
- if (in_interrupt()) {
- printk("kmem_cache_reap() called within int!\n");
- return;
- }
-
- /* We really need a test semaphore op so we can avoid sleeping when
- * !wait is true.
- */
- down(&cache_chain_sem);
-
- scan = 10;
- reap_level = 0;
-
- best_cachep = NULL;
- searchp = clock_searchp;
- do {
- unsigned int full_free;
- unsigned int dma_flag;
-
- /* It's safe to test this without holding the cache-lock. */
- if (searchp->c_flags & SLAB_NO_REAP)
- goto next;
- spin_lock_irq(&searchp->c_spinlock);
- if (searchp->c_growing)
- goto next_unlock;
- if (searchp->c_dflags & SLAB_CFLGS_GROWN) {
- searchp->c_dflags &= ~SLAB_CFLGS_GROWN;
- goto next_unlock;
- }
- /* Sanity check for corruption of static values. */
- if (searchp->c_inuse || searchp->c_magic != SLAB_C_MAGIC) {
- spin_unlock_irq(&searchp->c_spinlock);
- printk(KERN_ERR "kmem_reap: Corrupted cache struct for %s\n", searchp->c_name);
- goto next;
- }
- dma_flag = 0;
- full_free = 0;
-
- /* Count the fully free slabs. There should not be many,
- * since we are holding the cache lock.
- */
- slabp = searchp->c_lastp;
- while (!slabp->s_inuse && slabp != kmem_slab_end(searchp)) {
- slabp = slabp->s_prevp;
- full_free++;
- if (slabp->s_dma)
- dma_flag++;
- }
- spin_unlock_irq(&searchp->c_spinlock);
-
- if ((gfp_mask & GFP_DMA) && !dma_flag)
- goto next;
-
- if (full_free) {
- if (full_free >= 10) {
- best_cachep = searchp;
- break;
- }
-
- /* Try to avoid slabs with constructors and/or
- * more than one page per slab (as it can be difficult
- * to get high orders from gfp()).
- */
- if (full_free >= reap_level) {
- reap_level = full_free;
- best_cachep = searchp;
- }
- }
- goto next;
- next_unlock:
- spin_unlock_irq(&searchp->c_spinlock);
- next:
- searchp = searchp->c_nextp;
- } while (--scan && searchp != clock_searchp);
-
- clock_searchp = searchp;
- up(&cache_chain_sem);
-
- if (!best_cachep) {
- /* couldn't find anything to reap */
- return;
- }
-
- spin_lock_irq(&best_cachep->c_spinlock);
- while (!best_cachep->c_growing &&
- !(slabp = best_cachep->c_lastp)->s_inuse &&
- slabp != kmem_slab_end(best_cachep)) {
- if (gfp_mask & GFP_DMA) {
- do {
- if (slabp->s_dma)
- goto good_dma;
- slabp = slabp->s_prevp;
- } while (!slabp->s_inuse && slabp != kmem_slab_end(best_cachep));
-
- /* Didn't find a DMA slab (there was a free one -
- * it must have become active).
- */
- goto dma_fail;
- good_dma: ; /* empty statement - a label cannot end a compound statement */
- }
- if (slabp == best_cachep->c_freep)
- best_cachep->c_freep = slabp->s_nextp;
- kmem_slab_unlink(slabp);
- SLAB_STATS_INC_REAPED(best_cachep);
-
- /* Safe to drop the lock. The slab is no longer linked to the
- * cache.
- */
- spin_unlock_irq(&best_cachep->c_spinlock);
- kmem_slab_destroy(best_cachep, slabp);
- spin_lock_irq(&best_cachep->c_spinlock);
- }
- dma_fail:
- spin_unlock_irq(&best_cachep->c_spinlock);
- return;
- }
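- 
- /* Reaping policy, in short: scan up to 10 caches starting at the clock
- * hand; skip caches that are growing, were grown recently (SLAB_CFLGS_GROWN),
- * or are marked SLAB_NO_REAP; remember the cache with the most fully free
- * slabs (10 or more ends the scan early); then destroy that cache's free
- * slabs, subject to the DMA constraint.
- */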
-
- #if SLAB_SELFTEST
- /* A few v. simple tests */
- static void
- kmem_self_test(void)
- {
- kmem_cache_t *test_cachep;
-
- printk(KERN_INFO "kmem_test() - start\n");
- test_cachep = kmem_cache_create("test-cachep", 16, 0, SLAB_RED_ZONE|SLAB_POISON, NULL, NULL);
- if (test_cachep) {
- char *objp = kmem_cache_alloc(test_cachep, SLAB_KERNEL);
- if (objp) {
- /* Write in front and past end, red-zone test. */
- *(objp-1) = 1;
- *(objp+16) = 1;
- kmem_cache_free(test_cachep, objp);
-
- /* Mess up poisoning. */
- *objp = 10;
- objp = kmem_cache_alloc(test_cachep, SLAB_KERNEL);
- kmem_cache_free(test_cachep, objp);
-
- /* Mess up poisoning (again). */
- *objp = 10;
- kmem_cache_shrink(test_cachep);
- }
- }
- printk(KERN_INFO "kmem_test() - finished\n");
- }
- #endif /* SLAB_SELFTEST */
-
- #if defined(CONFIG_PROC_FS)
- /* /proc/slabinfo
- * cache-name num-active-objs total-objs num-active-slabs total-slabs num-pages-per-slab
- */
- int
- get_slabinfo(char *buf)
- {
- kmem_cache_t *cachep;
- kmem_slab_t *slabp;
- unsigned long active_objs;
- unsigned long save_flags;
- unsigned long num_slabs;
- unsigned long num_objs;
- int len=0;
- #if SLAB_STATS
- unsigned long active_slabs;
- #endif /* SLAB_STATS */
-
- __save_flags(save_flags);
-
- /* Output format version, so at least we can change it without _too_
- * many complaints.
- */
- #if SLAB_STATS
- len = sprintf(buf, "slabinfo - version: 1.0 (statistics)\n");
- #else
- len = sprintf(buf, "slabinfo - version: 1.0\n");
- #endif /* SLAB_STATS */
- down(&cache_chain_sem);
- cachep = &cache_cache;
- do {
- #if SLAB_STATS
- active_slabs = 0;
- #endif /* SLAB_STATS */
- num_slabs = active_objs = 0;
- spin_lock_irq(&cachep->c_spinlock);
- for (slabp = cachep->c_firstp; slabp != kmem_slab_end(cachep); slabp = slabp->s_nextp) {
- active_objs += slabp->s_inuse;
- num_slabs++;
- #if SLAB_STATS
- if (slabp->s_inuse)
- active_slabs++;
- #endif /* SLAB_STATS */
- }
- num_objs = cachep->c_num*num_slabs;
- #if SLAB_STATS
- {
- unsigned long errors;
- unsigned long high = cachep->c_high_mark;
- unsigned long grown = cachep->c_grown;
- unsigned long reaped = cachep->c_reaped;
- unsigned long allocs = cachep->c_num_allocations;
- errors = (unsigned long) atomic_read(&cachep->c_errors);
- spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
- len += sprintf(buf+len, "%-16s %6lu %6lu %4lu %4lu %4lu %6lu %7lu %5lu %4lu %4lu\n",
- cachep->c_name, active_objs, num_objs, active_slabs, num_slabs,
- (1<<cachep->c_gfporder)*num_slabs,
- high, allocs, grown, reaped, errors);
- }
- #else
- spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
- len += sprintf(buf+len, "%-17s %6lu %6lu\n", cachep->c_name, active_objs, num_objs);
- #endif /* SLAB_STATS */
- } while ((cachep = cachep->c_nextp) != &cache_cache);
- up(&cache_chain_sem);
-
- return len;
- }
- #endif /* CONFIG_PROC_FS */
-